1 Research Questions

  • what exactly do people use rental bikes for?
  • where do people come from and where do they go?
  • further ideas for research questions: see rlang.io

2 Load Packages

require(tidyverse)
require(osmdata)
require(sf)
require(leaflet)

3 Read Data

stations_data <- data.table::fread("./data/bikeshare_stations_hh.csv")
trips_data <- data.table::fread("./data/bikeshare_trips_hh.csv")
landuse_sf_data <- readRDS("./data/landuse_sf.rds")

4 Merge Data Sets

start_stations <-
  merge(trips_data, stations_data, by.y = "station_id", by.x = "start_rental_zone_hal_id") %>% select(name, lat, lon)
colnames(start_stations) <-
  c("name_start_station",
    "lat_start_station",
    "lon_start_station")

end_stations <-
  merge(trips_data, stations_data, by.y = "station_id", by.x = "end_rental_zone_hal_id") %>% select(name, lat, lon)
colnames(end_stations) <-
  c("name_end_station", "lat_end_station", "lon_end_station")

trips_data_latlon <- cbind(trips_data, start_stations, end_stations)

5 Feature Engineering

require(geosphere)
# https://cran.r-project.org/web/packages/geosphere/vignettes/geosphere.pdf

get_time_of_day <- function(character_hhmm){
  x <- as.numeric(str_replace_all(character_hhmm, ":", ""))
  time_of_day <- ifelse(700 <= x & x < 1100, "Morgen", 
                        ifelse(1100 <= x & x < 1300, "Vormittag", 
                               ifelse(1300 <= x & x < 1500, "Mittag", 
                                      ifelse(1500 <= x & x < 1800, "Nachmittag", 
                                             ifelse(1800 <= x & x < 2200, "Abend", "Nacht")
                                             )
                                      )
                               )
                        )
  return(factor(time_of_day, levels = c("Morgen", "Vormittag", "Mittag", "Nachmittag", "Abend", "Nacht")))
}

get_dist <- function(lon_start, lat_start, lon_end, lat_end) {
  p_start = cbind(lon_start, lat_start)
  p_end = cbind(lon_end, lat_end)
  return(distGeo(p_start, p_end))
}

trips_data_fts <- trips_data_latlon %>% 
  mutate(roundtrip = ifelse(start_rental_zone_hal_id == end_rental_zone_hal_id, TRUE, FALSE),
         timediff_hours = difftime(datetime_to, datetime_from, units = "hours"),
         timediff_mins = difftime(datetime_to, datetime_from, units = "mins"), 
         time_of_day_start = get_time_of_day(hourmin_from), 
         time_of_day_end = get_time_of_day(hourmin_to), 
         distance_meters = get_dist(lon_start_station, lat_start_station, 
                                    lon_end_station, lat_end_station)) %>% 
  mutate(speed = ifelse(distance_meters==0, NA, as.numeric(timediff_mins)/distance_meters))

6 Some Descriptive Statistics On Stations and Trips

# count of start station
start_trip_counts <- trips_data_fts %>% group_by(start_rental_zone_hal_id) %>% 
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
start_trip_counts
# count of end station
end_trip_counts <- trips_data_fts %>% group_by(end_rental_zone_hal_id) %>% 
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
end_trip_counts
# count of roundtrips
trips_data_fts %>% 
group_by(roundtrip) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
# histogram on trip length
mw_timediff_mins <- signif(mean(as.numeric(trips_data_fts$timediff_mins)), digits = 6)
brks <- c(3,10,30,60,90,120,150,300)
trips_data_fts %>%
  ggplot(aes(as.numeric(timediff_mins))) +
  geom_histogram(bins = 200) + 
  geom_vline(xintercept = 30, col="black", lty="dashed") + 
  annotate("text", x = 30-1.5, y = 450, col="black", label="First 30 mins are free of charge", angle = 90) + 
  geom_vline(xintercept = mw_timediff_mins, col="red") + 
  annotate("text", x = mw_timediff_mins-1.5, y = 350, col="red", label="Mean TimeDiff per Trip", angle = 90) + 
  labs(title = paste0("Majority of trips are free of charge and mean trip duration is ", mw_timediff_mins, dig=2)) +
  scale_x_log10(name ="log trip duration [mins]", labels =  as.character(brks), breaks = brks) + 
  scale_y_continuous(n.breaks = 10)

# proportion of trips with duration smaller than mean trip duration
trips_data_fts %>% 
  mutate(smaller_mw = timediff_mins <= mw_timediff_mins) %>% 
  group_by(smaller_mw) %>% 
  summarize(n = n()) %>% 
  mutate(freq = n / sum(n))
# proportion of trips with duration smaller 30 mins
trips_data_fts %>% 
  mutate(smaller_30 = as.numeric(timediff_mins) <= 30) %>% 
  group_by(smaller_30) %>% 
  summarize(n = n()) %>% 
  mutate(freq = n / sum(n))
# count of trip-starts as per day and per time of day
starts_by_date <- trips_data_fts %>% 
  group_by(date_from, time_of_day_start) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  group_by(date_from) %>%  
  mutate(freq = n/sum(n)) 
starts_by_date
starts_by_date %>% 
  ggplot(aes(x = as.POSIXct(date_from), y = n, group = time_of_day_start, col = time_of_day_start)) + 
  geom_line() + 
  scale_x_datetime(name = "Start Date", date_breaks = "day", date_labels = "%Y-%m-%d (%a)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) + 
  scale_y_continuous(breaks = seq(0,3500, by = 500))

duration_by_date <-  trips_data_fts %>% 
  group_by(date_from, time_of_day_start) %>% 
  summarise(mean_duration = as.numeric(mean(timediff_mins)), 
            sd_duration = as.numeric(sd(timediff_mins))) %>% 
  ungroup() 
duration_by_date
duration_by_date %>% 
  ggplot(aes(x = as.POSIXct(date_from), y = mean_duration, group = time_of_day_start, col = time_of_day_start)) + 
  geom_errorbar(aes(ymin=mean_duration-sd_duration, ymax=mean_duration+sd_duration), width=.1) +
  geom_line() +
  geom_point() + 
  scale_x_datetime(name = "Start Date", date_breaks = "day", date_labels = "%Y-%m-%d (%a)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(title = "mean duration of trips as per start date and time of day")

Observations on trips and their duration: * majority of trips end at different stations as they started (94.6%) * majority of trips are free of charge (86.6%) * mean trip duration is ~ 20 minutes

Observations on start date of trips: * Rentals at night increase especially at the weekend * Vice versa rentals in the morning decrease especially at the weekend * Most trips are started in the afternoon or evening

7 Interactive Visualization of start and end stations

start_trip_counts_latlon <- merge(start_trip_counts, stations_data, by.x = "start_rental_zone_hal_id", by.y = "station_id") %>% 
  mutate(rental_zone_id = start_rental_zone_hal_id, group = "Departure") %>% 
  select(-start_rental_zone_hal_id)

end_trip_counts_latlon <- merge(end_trip_counts, stations_data, by.x = "end_rental_zone_hal_id", by.y = "station_id") %>% 
  mutate(rental_zone_id = end_rental_zone_hal_id, group = "Arrival") %>% 
  select(-end_rental_zone_hal_id)

df_station_counts <- rbind(start_trip_counts_latlon, end_trip_counts_latlon)

# color mapping
# pie(rep(1:12), col=rainbow(12))
cols <- c(1,9)
factpal <- colorFactor(rainbow(12)[cols], df_station_counts$group)

m <- leaflet(df_station_counts) %>% 
  addTiles() %>% 
  addCircles(lng = ~lon, lat = ~lat, radius = ~sqrt(n)*10, 
             group = ~group, color = ~factpal(group),
             popup = ~name, weight = 1) %>% 
  addLayersControl(
    baseGroups = c("Departure", "Arrival"),
    options = layersControlOptions(collapsed = TRUE))
m

8 Interactive Visualization of trips

trips_df <- trips_data_fts %>% 
  group_by(start_rental_zone_hal_id, end_rental_zone_hal_id) %>% 
  summarise(n_trips = n()
            , n_dist_customer = n_distinct(customer_hal_id)
            , mw_duration = mean(timediff_mins)
            # , mean_speed = timediff_mins / (distance_meters / 1000)
            ) %>% 
  ungroup() %>% 
  mutate(freq_trips = n_trips / sum(n_trips), 
         freq_dist_customer = n_dist_customer / sum(n_dist_customer), 
         trips_per_customer = n_trips / n_dist_customer)

trips_lonlat <- merge(trips_df, stations_data, 
                      by.x = "start_rental_zone_hal_id", by.y = "station_id") %>% 
  rename(start_name = name, start_lat = lat, start_lon = lon) %>% 
  merge(x = ., stations_data, 
        by.x = "end_rental_zone_hal_id", by.y = "station_id") %>% 
  rename(end_name = name, end_lat = lat, end_lon = lon)
trips_lonlat$route_id <- 1:nrow(trips_lonlat)
# trips_lonlat = trips_lonlat %>% slice(1:1000)
trips_lonlat
lb <- 25
trips_lonlat_vis <- trips_lonlat %>% arrange(desc(n_trips)) %>% filter(n_trips >= lb)

m1 <- leaflet(trips_lonlat_vis) %>%
  addTiles() 

for(i in 1:nrow(trips_lonlat_vis)){
  popup_txt =  paste("Station (Departure):", trips_lonlat_vis[i,"start_name"], "<br>",
                               "Station (Arrival):", trips_lonlat_vis[i,"end_name"], "<br>", 
                               "Number of Trips:", trips_lonlat_vis[i,"n_trips"], "<br>", 
                               "Freq of Trips:", trips_lonlat_vis[i,"freq_trips"], "<br>", 
                               "Number of customers:", trips_lonlat_vis[i,"n_dist_customer"], "<br>",
                               "Freq of Customers", trips_lonlat_vis[i,"freq_dist_customer"], "<br>", 
                               "Mean Duration:", trips_lonlat_vis[i,"mw_duration"], "<br>",
                               "Trips per Customer:", trips_lonlat_vis[i,"trips_per_customer"], "<br>")

  m1 <- m1 %>%
    addPolylines(data = trips_lonlat_vis[i,],
                 lng = ~ c(start_lon, end_lon),
                 lat = ~ c(start_lat, end_lat),
                 popup = popup_txt,
                 weight = ~sqrt(n_trips-lb+1)*0.5)
}
m1

The map shows connections of station coordinates (directed trips) that appear at least 25 times. Size of lines increases in number of trips. Thus connections can be identified that are used more oftne. Right now, direction of trips (lon1,lat1) -> (lon2, lat2) and (lon2,lat2) -> (lon1, lat1) is distinguished i.e. some stations are connection by (duplicated) lines. Removing those duplicates is postponed. This is essential to identify the line that connects two station and appears most in data. To get a first result the map above can be used.

## TODO: count unique connections i.e. remove direction of trips and add visualization

9 Mining of osm data

https://github.com/jasmincl/correlaid-hackathon-spatialdata

## TODO: get min distance between two arbitrary stations
## TODO: mine osm data

[1] “booking_hal_id” “vehicle_hal_id” “customer_hal_id” “date_booking” “datetime_from” “datetime_to”
[7] “compute_extra_booking_fee” “traverse_use” “distance” “start_rental_zone” “start_rental_zone_hal_id” “end_rental_zone”
[13] “end_rental_zone_hal_id” “city_rental_zone” “date_from” “date_to” “hour_from” “hour_to”
[19] “timediff” “as.numeric(timediff)” “weekday_from” “weekday_to” “minute_from” “minute_to”
[25] “time_from” “time_to” “hourmin_from” “hourmin_to” “name_start_station” “lat_start_station”
[31] “lon_start_station” “name_end_station” “lat_end_station” “lon_end_station” “roundtrip” “timediff_hours”
[37] “timediff_mins” “time_of_day_start” “time_of_day_end”

10 Old Code